** This do file recreates the results summarized in Figure 3 of the paper

* The code uses aggregate census data calculated from the IPUMS International 
* Census Microdata Repository at https://international.ipums.org/international/

* set directory to folder with replication data
* (forward slashes are used because Stata accepts them as the directory
*  separator on Windows, macOS and Unix, whereas backslashes only work on Windows)
cd "~/Replication/"

*** Aggregating responses from online surveys

** brazil 2020
* The blackbox and matching samples go through exactly the same steps, so the
* steps are written once and run for each sample in turn. Every pass keeps the
* original save/append order, so each output file is assembled in the same
* sequence as before. The loop macro smp names both the input file
* (br_<smp>.dta) and the percentage column written to the output files.
foreach smp in blackbox matching {

    *** demographics: percent of respondents per category (fig3_<smp>_demo.dta)

    * sex
    use br_`smp'.dta, clear
    drop gender
    rename p_sexo gender
    * denominator: number of nonmissing values of finished in the whole sample
    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(gender)
    gen sex = "Hombre" if gender == 1
    replace sex = "Mujer" if gender == 2
    gen svy_pct = svy_cases / svy_total * 100

    rename sex category
    drop gender
    rename svy_pct `smp'
    gen country = "brazil"
    gen variable = "sex"
    gen year = "2020"
    * first section for this sample: creates the file that later sections append to
    save "fig3_`smp'_demo.dta", replace

    * edu
    use br_`smp'.dta, clear
    drop edr
    * collapse the survey education codes into the harmonised ed_level codes
    recode br_education_level (1 2 3=1)(4=2)(5 6 7 8=3), gen(edr)
    lab define ed_level 0 "None" 1 "Primary" 2 "Secondary" 3 "Post-secondary" 4 "DK/NA"
    lab values edr ed_level

    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(edr)
    lab values edr ed_level
    gen svy_pct = svy_cases / svy_total * 100

    * the category key is the text of the value label
    decode edr, gen(category)
    drop edr
    rename svy_pct `smp'
    gen country = "brazil"
    gen variable = "educ"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    * prov
    use br_`smp'.dta, clear
    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(int_estado)
    gen svy_pct = svy_cases / svy_total * 100

    decode int_estado,  gen(category)
    drop int_estado
    rename svy_pct `smp'
    gen country = "brazil"
    gen variable = "prov"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    * age
    use br_`smp'.dta, clear
    drop edad
    gen edad = 1 if panelistage <= 25
    replace edad = 2 if panelistage > 25 & panelistage <= 35
    replace edad = 3 if panelistage > 35 & panelistage <= 45
    replace edad = 4 if panelistage > 45 & panelistage <= 55
    replace edad = 5 if panelistage > 55 & panelistage <= 65
    * Stata treats missing values as larger than every number, so the top group
    * must exclude them explicitly; otherwise unknown ages are counted as 66+
    replace edad = 6 if panelistage > 65 & !missing(panelistage)
    * edad_lbl is not defined in this script; presumably it ships with the
    * dataset - TODO confirm
    lab value edad edad_lbl

    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(edad)
    rename edad age
    gen svy_pct = svy_cases / svy_total * 100

    decode age, gen(category)
    drop age
    rename svy_pct `smp'
    gen country = "brazil"
    gen variable = "age"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    *** benchmark questions: percent of nonmissing answers per category
    *** (fig3_<smp>_benchmark.dta)
    * In every section below svy: tab only prints a tabulation (it relies on
    * svyset information that is not set in this script; presumably stored in
    * the dataset - TODO confirm). The percentages that are saved come from
    * collapse (percent), which is run without weights.

    * household size
    use br_`smp'.dta, clear
    recode q144 (4=1)(5=2)(7=3)(8=4)(9=5)(10=6)(11=7)(12=8)(13=9)(14=10) , gen(house_size) 
    svy: tab house_size
    * constant column so that collapse (percent) returns the share of rows per group
    gen `smp' = 1
    drop if house_size == .
    collapse (percent) `smp', by(house_size)
    gen country = "brazil"
    gen comparison_to = "pnadc2017"
    tostring house_size, replace
    rename house_size category
    gen variable = "house_size"
    * first benchmark section for this sample: creates the benchmark file
    save "fig3_`smp'_benchmark.dta", replace

    * number of rooms
    use br_`smp'.dta, clear
    rename q147 num_rooms
    svy: tab num_rooms
    gen `smp' = 1
    drop if num_rooms == .
    collapse (percent) `smp', by(num_rooms)
    gen country = "brazil"
    gen comparison_to = "pnadc2017"
    tostring num_rooms, replace
    rename num_rooms category
    gen variable = "num_rooms"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * home ownership
    use br_`smp'.dta, clear
    gen `smp' = 1
    drop if q145 == .
    label define q145 1 "Próprio de algum morador - já pago" 2 "Próprio de algum morador - ainda pagando" 3 "Alugado" 4 "Cedido por empregador" 5 "Cedido por familiar" 6 "Cedido de outra forma" 7 "Outra condição"
    label values q145 q145
    svy: tab q145
    collapse (percent) `smp', by(q145)
    gen country = "brazil"
    gen comparison_to = "pnadc2017"
    decode q145, gen(q145_str)
    rename q145_str category
    drop q145
    gen variable = "homeowner"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * employment
    use br_`smp'.dta, clear
    * q146 code 1 is coded as employed, codes 2 and 3 as not employed;
    * any other value stays missing and is dropped below
    gen employed = 1 if q146 == 1
    replace employed = 0 if inlist(q146, 2, 3)
    label define employed 1 "Sim" 0 "Não"
    label values employed employed
    gen `smp' = 1
    drop if employed == .
    svy: tab employed
    collapse (percent) `smp', by(employed)
    gen country = "brazil"
    gen comparison_to = "pnadc2017"
    decode employed, gen(employed_str)
    rename employed_str category
    drop employed
    gen variable = "employment"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace
}



** argentina 2020
* The blackbox and matching samples go through exactly the same steps, so the
* steps are written once and run for each sample in turn. Every pass keeps the
* original append order. The loop macro smp names both the input file
* (ar_<smp>.dta) and the percentage column written to the output files.
* Every section appends to the fig3_<smp>_demo.dta / fig3_<smp>_benchmark.dta
* files, which must already exist when this part of the script runs.
foreach smp in blackbox matching {

    *** demographics: percent of respondents per category (fig3_<smp>_demo.dta)

    * sex
    use ar_`smp'.dta, clear
    drop gender
    rename p_sexo gender
    * denominator: number of nonmissing values of finished in the whole sample
    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(gender)
    gen sex = "Hombre" if gender == 1
    replace sex = "Mujer" if gender == 2
    gen svy_pct = svy_cases / svy_total * 100

    rename sex category
    drop gender
    rename svy_pct `smp'
    gen country = "argentina"
    gen variable = "sex"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    * education
    use ar_`smp'.dta, clear
    drop edr
    * collapse the survey education codes into the harmonised ed_level codes
    recode ar_education_level (1=0)(2 3=1)(4 5=2)(6 7 8 9 10=3)(11=4), gen(edr)
    lab define ed_level 0 "None" 1 "Primary" 2 "Secondary" 3 "Post-secondary" 4 "DK/NA"
    lab values edr ed_level

    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(edr)
    lab values edr ed_level
    gen svy_pct = svy_cases / svy_total * 100

    * the category key is the text of the value label
    decode edr, gen(category)
    drop edr
    rename svy_pct `smp'
    gen country = "argentina"
    gen variable = "educ"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    * age
    use ar_`smp'.dta, clear
    drop edad
    gen edad = 1 if panelistage <= 25
    replace edad = 2 if panelistage > 25 & panelistage <= 35
    replace edad = 3 if panelistage > 35 & panelistage <= 45
    replace edad = 4 if panelistage > 45 & panelistage <= 55
    replace edad = 5 if panelistage > 55 & panelistage <= 65
    * Stata treats missing values as larger than every number, so the top group
    * must exclude them explicitly; otherwise unknown ages are counted as 66+
    replace edad = 6 if panelistage > 65 & !missing(panelistage)
    * edad_lbl is not defined in this script; presumably it ships with the
    * dataset - TODO confirm
    lab value edad edad_lbl

    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(edad)
    rename edad age
    gen svy_pct = svy_cases / svy_total * 100

    decode age, gen(category)
    drop age
    rename svy_pct `smp'
    gen country = "argentina"
    gen variable = "age"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    * prov
    use ar_`smp'.dta, clear
    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(ar_provincia)
    gen svy_pct = svy_cases / svy_total * 100

    decode ar_provincia,  gen(category)
    drop ar_provincia
    rename svy_pct `smp'
    gen country = "argentina"
    gen variable = "prov"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    *** benchmark questions: percent of nonmissing answers per category
    *** (fig3_<smp>_benchmark.dta)
    * In every section below svy: tab only prints a tabulation (it relies on
    * svyset information that is not set in this script; presumably stored in
    * the dataset - TODO confirm). The percentages that are saved come from
    * collapse (percent), which is run without weights.

    * household size
    use ar_`smp'.dta, clear
    recode q402 (1=1)(4=2)(5=3)(6=4)(7=5)(8=6)(9=7)(10=8)(11=9)(12=10) , gen(house_size) 
    svy: tab house_size
    * constant column so that collapse (percent) returns the share of rows per group
    gen `smp' = 1
    drop if house_size == .
    collapse (percent) `smp', by(house_size)
    gen country = "argentina"
    gen comparison_to = "eph2018"
    tostring house_size, replace
    rename house_size category
    gen variable = "house_size"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * number of rooms
    use ar_`smp'.dta, clear
    recode q408 (4=1)(7=2)(8=3)(9=4)(10=5)(11=6)(12=7)(13=8), gen(num_rooms)
    svy: tab num_rooms
    gen `smp' = 1
    drop if num_rooms == .
    collapse (percent) `smp', by(num_rooms)
    gen country = "argentina"
    gen comparison_to = "eph2018"
    tostring num_rooms, replace
    rename num_rooms category
    gen variable = "num_rooms"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * employment (1 = Yes, 2 = No)
    use ar_`smp'.dta, clear
    rename q398 trabajo_sp
    label define trabajo_sp 1 "Si" 2 "No"
    label values trabajo_sp trabajo_sp
    svy: tab trabajo_sp
    gen `smp' = 1
    drop if trabajo_sp == .
    collapse (percent) `smp', by(trabajo_sp)
    gen country = "argentina"
    gen comparison_to = "eph2018"
    decode trabajo_sp, gen(trabajo_sp_str)
    rename trabajo_sp_str category
    drop trabajo_sp
    gen variable = "employment"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * government assistance: q409 is a string holding a comma-separated list of
    * selected options; the three sections below flag option 1, 2 and 3 in turn.
    * NOTE(review): each indicator is set to 0 for every row that is not flagged,
    * including rows where q409 is empty, so the drop-if-missing lines below
    * never remove anything. Confirm that blank answers should count as 0.

    * pension (q409 contains option 1)
    use ar_`smp'.dta, clear
    gen pension = 1 if inlist(q409, "1", "1,2", "1,2,3", "1,3")
    replace pension = 0 if !inlist(q409, "1", "1,2", "1,2,3", "1,3")
    label define pension 1 "jubilación o pension" 0 "no jubilación o pension"
    label values pension pension
    svy: tab pension
    gen `smp' = 1
    drop if pension == .
    collapse (percent) `smp', by(pension)
    gen country = "argentina"
    gen comparison_to = "eph2018"
    decode pension, gen(pension_str)
    rename pension_str category
    drop pension
    gen variable = "govassist_retired"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * subsidy (q409 contains option 2)
    * NOTE(review): the label text for value 1 ends with a trailing space;
    * confirm that it matches the category text in the benchmark aggregates
    use ar_`smp'.dta, clear
    gen subsidy = 1 if inlist(q409, "1,2", "1,2,3", "2", "2,3")
    replace subsidy = 0 if !inlist(q409, "1,2", "1,2,3", "2", "2,3")
    label define subsidy 1 "subsidio o ayuda social " 0 "no subsidio o ayuda social"
    label values subsidy subsidy
    svy: tab subsidy
    gen `smp' = 1
    drop if subsidy == .
    collapse (percent) `smp', by(subsidy)
    gen country = "argentina"
    gen comparison_to = "eph2018"
    decode subsidy, gen(subsidy_str)
    rename subsidy_str category
    drop subsidy
    gen variable = "govassist_subsidy"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * goods from govt or church (q409 contains option 3)
    use ar_`smp'.dta, clear
    gen social_goods = 1 if inlist(q409, "1,2,3", "1,3", "2,3", "3")
    replace social_goods = 0 if !inlist(q409, "1,2,3", "1,3", "2,3", "3")
    label define social_goods 1 "mercaderías, ropa, alimentos" 0 "no mercaderías, ropa, alimentos"
    label values social_goods social_goods
    svy: tab social_goods
    gen `smp' = 1
    drop if social_goods == .
    collapse (percent) `smp', by(social_goods)
    gen country = "argentina"
    gen comparison_to = "eph2018"
    decode social_goods, gen(social_goods_str)
    rename social_goods_str category
    drop social_goods
    gen variable = "govassist_goods"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace
}



** mexico 2020
* The blackbox and matching samples go through exactly the same steps, so the
* steps are written once and run for each sample in turn. Every pass keeps the
* original append order. The loop macro smp names both the input file
* (mx_<smp>.dta) and the percentage column written to the output files.
* Every section appends to the fig3_<smp>_demo.dta / fig3_<smp>_benchmark.dta
* files, which must already exist when this part of the script runs.
foreach smp in blackbox matching {

    *** demographics: percent of respondents per category (fig3_<smp>_demo.dta)

    * sex
    use mx_`smp'.dta, clear
    drop gender
    rename p_sexo gender
    * denominator: number of nonmissing values of finished in the whole sample
    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(gender)
    gen sex = "Hombre" if gender == 1
    replace sex = "Mujer" if gender == 2
    gen svy_pct = svy_cases / svy_total * 100

    rename sex category
    drop gender
    rename svy_pct `smp'
    gen country = "mexico"
    gen variable = "sex"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    * education
    use mx_`smp'.dta, clear
    drop edr
    * collapse the survey education codes into the harmonised ed_level codes
    recode mx_education_level (1=0)(2 3=1)(4 5 6 7 8 9=2)(10 11 12 13=3), gen(edr)
    lab define ed_level 0 "None" 1 "Primary" 2 "Secondary" 3 "Post-secondary" 4 "DK/NA"
    lab values edr ed_level

    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(edr)
    lab values edr ed_level
    gen svy_pct = svy_cases / svy_total * 100

    * the category key is the text of the value label
    decode edr, gen(category)
    drop edr
    rename svy_pct `smp'
    gen country = "mexico"
    gen variable = "educ"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    * age
    use mx_`smp'.dta, clear
    drop edad
    gen edad = 1 if panelistage <= 25
    replace edad = 2 if panelistage > 25 & panelistage <= 35
    replace edad = 3 if panelistage > 35 & panelistage <= 45
    replace edad = 4 if panelistage > 45 & panelistage <= 55
    replace edad = 5 if panelistage > 55 & panelistage <= 65
    * Stata treats missing values as larger than every number, so the top group
    * must exclude them explicitly; otherwise unknown ages are counted as 66+
    replace edad = 6 if panelistage > 65 & !missing(panelistage)
    * edad_lbl is not defined in this script; presumably it ships with the
    * dataset - TODO confirm
    lab value edad edad_lbl

    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(edad)
    rename edad age
    gen svy_pct = svy_cases / svy_total * 100

    decode age, gen(category)
    drop age
    rename svy_pct `smp'
    gen country = "mexico"
    gen variable = "age"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    save "fig3_`smp'_demo.dta", replace

    * prov
    use mx_`smp'.dta, clear
    egen svy_total = count(finished)
    collapse (count) svy_cases=finished (first) svy_total, by(mx_int_estado)
    gen svy_pct = svy_cases / svy_total * 100

    decode mx_int_estado,  gen(category)
    * title-case the state names, then repair the names whose accented letters
    * come out of proper() with the wrong capitalisation
    replace category = proper(category)
    replace category = "Ciudad de México" if category == "Ciudad De MÉXico"
    replace category = "Michoacán" if category == "MichoacÁN"
    replace category = "México" if category == "MÉXico"
    replace category = "Nuevo León" if category == "Nuevo LeÓN"
    replace category = "Querétaro" if category == "QuerÉTaro"
    replace category = "San Luis PotosÍ" if category == "__never__" & 0
    replace category = "San Luis Potosí" if category == "San Luis PotosÍ"
    replace category = "Yucatán" if category == "YucatÁN"
    drop mx_int_estado
    rename svy_pct `smp'
    gen country = "mexico"
    gen variable = "prov"
    gen year = "2020"
    append using "fig3_`smp'_demo.dta"
    * last demographic section: this runs after the append, so it rewrites the
    * country key of every row in the file as country plus year, and turns
    * year into a numeric variable
    replace country = country + " " + year
    destring year, replace
    save "fig3_`smp'_demo.dta", replace

    *** benchmark questions: percent of nonmissing answers per category
    *** (fig3_<smp>_benchmark.dta)
    * In every section below svy: tab only prints a tabulation (it relies on
    * svyset information that is not set in this script; presumably stored in
    * the dataset - TODO confirm). The percentages that are saved come from
    * collapse (percent), which is run without weights.

    * household size
    use mx_`smp'.dta, clear
    recode q138 (1=1)(4=2)(5=3)(6=4)(7=5)(8=6)(9=7)(10=8)(11=9)(12=10) , gen(house_size) 
    svy: tab house_size
    * constant column so that collapse (percent) returns the share of rows per group
    gen `smp' = 1
    drop if house_size == .
    collapse (percent) `smp', by(house_size)
    gen country = "mexico"
    gen comparison_to = "enh2017"
    tostring house_size, replace
    rename house_size category
    gen variable = "house_size"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * number of rooms
    use mx_`smp'.dta, clear
    recode v273 (4=1)(7=2)(8=3)(9=4)(10=5)(11=6)(12=7)(13=8), gen(num_rooms)
    svy: tab num_rooms
    gen `smp' = 1
    drop if num_rooms == .
    collapse (percent) `smp', by(num_rooms)
    gen country = "mexico"
    gen comparison_to = "enh2017"
    tostring num_rooms, replace
    rename num_rooms category
    gen variable = "num_rooms"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * home ownership
    use mx_`smp'.dta, clear
    gen `smp' = 1
    drop if q140 == .
    label define q140 1 "es rentada" 2 "es prestada" 3 "es propia pero la están pagando" 4 "es propia" 5 "esta en litigio" 6 "otra situación"
    label values q140 q140
    svy: tab q140
    collapse (percent) `smp', by(q140)
    gen country = "mexico"
    gen comparison_to = "enh2017"
    decode q140, gen(q140_str)
    rename q140_str category
    drop q140
    gen variable = "homeowner"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace

    * employment (1 = Yes, 2 = No)
    use mx_`smp'.dta, clear
    rename q141 trabajo_sp
    label define trabajo_sp 1 "Si" 2 "No"
    label values trabajo_sp trabajo_sp
    svy: tab trabajo_sp
    gen `smp' = 1
    drop if trabajo_sp == .
    collapse (percent) `smp', by(trabajo_sp)
    gen country = "mexico"
    gen comparison_to = "enh2017"
    decode trabajo_sp, gen(trabajo_sp_str)
    rename trabajo_sp_str category
    drop trabajo_sp
    gen variable = "employment"
    append using "fig3_`smp'_benchmark.dta"
    save "fig3_`smp'_benchmark.dta", replace
}


*** COMPARISONS

*** Census variables: join the census aggregates with both survey samples,
*** compute absolute errors and draw the left-hand input of Figure 3
import delimited "census_aggregates.csv", clear
keep if year == 2020

* Attach each sample's percentages in turn. A census category that no survey
* respondent falls into (_merge == 1) gets a survey percentage of 0; rows that
* exist only in the survey file are kept here and fall out at the modal filter.
foreach smp in blackbox matching {
    merge 1:1 country variable category using "fig3_`smp'_demo.dta"
    replace `smp' = 0 if _merge == 1
    drop _merge
}

* duplicate every row; the copies form the pooled group, the originals keep
* their own country name in proper case
expand 2, generate(duplicate)
replace country = cond(duplicate == 1, "Pooled", strproper(country))

* keep only the response category that is the modal response in the benchmark data
keep if modal == 1

* absolute differences between the population and survey percentages
* (abs_error_netquest holds the blackbox sample's error)
generate abs_error_netquest = abs(population - blackbox)
generate abs_error_matching = abs(population - matching)

* bar graph: graph bar plots the mean of each error variable within country
graph bar abs_error_netquest abs_error_matching, ///
    over(country, label(angle(45))) ///
    ytitle("Mean absolute error") ///
    ylabel(0 "0" 2 "2" 4 "4" 6 "6" 8 "8" 10 "10" 12 "12" 14 "14" 16 "16") ///
    title("Census Variables") ///
    legend(order(1 "Blackbox" 2 "Matching") position(6) cols(2)) ///
    scheme(plotplain) fxsize(80) scale(1.5) ///
    saving(demo_errors, replace)


*** Benchmark questions: join the benchmark survey aggregates with both survey
*** samples, compute absolute errors and draw the right-hand input of Figure 3
import delimited "benchmark_surveys_aggregates.csv", clear

* Attach each sample's percentages in turn.
* NOTE(review): unlike the census step, benchmark categories without a match in
* the survey file are left missing rather than set to 0 - confirm this is intended.
foreach smp in blackbox matching {
    merge 1:1 country variable category using "fig3_`smp'_benchmark.dta", nogenerate
}

* duplicate every row; the copies form the pooled group, the originals keep
* their own country name in proper case
expand 2, generate(duplicate)
replace country = cond(duplicate == 1, "Pooled", strproper(country))

* keep only the response category that is the modal response in the benchmark data
keep if modal == 1

* absolute differences between the population and survey percentages
* (abs_error_netquest holds the blackbox sample's error)
generate abs_error_netquest = abs(population - blackbox)
generate abs_error_matching = abs(population - matching)

* bar graph: graph bar plots the mean of each error variable within country
graph bar abs_error_netquest abs_error_matching, ///
    over(country, label(angle(45))) ///
    ytitle("Mean absolute error") ///
    ylabel(0 "0" 2 "2" 4 "4" 6 "6" 8 "8" 10 "10" 12 "12" 14 "14" 16 "16") ///
    title("Benchmark Questions") ///
    legend(order(1 "Blackbox" 2 "Matching") position(6) cols(2)) ///
    scheme(plotplain) fxsize(80) scale(1.5) ///
    saving(bench_errors, replace)

* combine the two saved graphs side by side into figure 3
graph combine bench_errors.gph demo_errors.gph, rows(1) scheme(plotplain)
